# !sudo pip install streamlit==1.24.0
# !sudo pip install torch
# !sudo pip install omnixai
# !sudo pip install --upgrade ipython
# !sudo pip install plotly
One of the leading retail stores in the US, ABC, would like to predict the customers and demand accurately. There are certain events and holidays which impact sales on each day. There is customer data available for 51 stores of ABC. The business is facing a challenge due to unforeseen demands and runs out of stock sometimes, due to the inappropriate machine learning algorithm. An ideal ML algorithm will predict demand accurately and ingest factors like promotions, state and school holidays conditions. ABC runs several promotional markdown events throughout the year. These markdowns precede prominent holidays, the four largest of all, which are the Super Bowl, Labour Day, Thanksgiving, and Christmas. Part of the challenge presented by this competition is modelling the effects of markdowns on these holiday weeks in the absence of complete/ideal historical data. Historical data for 51 stores located in different regions are available. The dataset contains 51 stores’ customers data along with other details like day of the week, store id, promotion and holidays details from Jan 2014 to 2016.
Date : Date
Store:Store id
Day of Week: Day of the week (encoded):
1: Sunday 2: Monday… 7: Saturday
Customers: Number of customers on a given day
Open : An indicator for whether the store is open:
0: Close; 1: Open
Promotion: An Indicator whether a store is running a promotion on given day. 0: No Promotion; 1: Promotion
State Holiday: Indicates a state holiday; Normally all stores, with few exceptions, are closed on state holidays.
0: No state holiday; ‘a’, ‘b’, ‘c’: different types of holidays
School Holiday: An indicator if the (Store, Date) was affected by the closure of public schools
Initial Guidelines:
Utilize software engineering aspects while building Machine learning model using modular programming principles to organize your code into reusable functions or classes to enhance readability, maintainability, and collaboration.
# code starts here and # code ends hereno error while executing the notebook. If there are any error causing code, please comment it.Python 3 (ipykernel) if not set alreadyNote section will provide you hints to solve the problem.PRINT statement inside the Except Block. Please use return statement only within the except blockImport various libraries and modules used in data analysis, machine learning, and visualization tasks in Python such as pandas, numpy, sklearn, matplotlib, datetime , timeit, seaborn, statsmodels,statsmodels.tsa.arima.model statsmodels.tsa.stattools,statsmodels.api , sklearn.model_selection, sklearn.linear_model, sklearn.metrics, sklearn.preprocessing. There are 2 ways to import the libraries and modules:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.arima.model import ARIMA
import itertools
import warnings
import pickle
# The Store50 variable stores the dataframe; `return Store50` returns it
# (per the assignment, the except block also uses a return statement).
def load_the_dataset(dataset_location):
    """Load the store dataset from an Excel file.

    Parameters:
        dataset_location (str): path to the .xlsx file.

    Returns:
        DataFrame with 'Date' parsed as the index on success, otherwise the
        error message string "File not found. Please check the file".
    """
    # code starts here
    Store50 = None
    try:
        # Load the data set; 'Date' becomes a DatetimeIndex for resampling later.
        Store50 = pd.read_excel(dataset_location, parse_dates = True, index_col = 'Date')
        return Store50
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit still propagate.
        return "File not found. Please check the file"
    # code ends here
# store the result of the dataset
dataset_location = "store50.xlsx"
# On failure this will be the error string, not a DataFrame (see load_the_dataset).
Store50=load_the_dataset(dataset_location)
# Preview the first rows to sanity-check parsing and the Date index.
Store50.head()
| Store | Day Of Week | Customers | Open | Promotion | State Holiday | School Holiday | |
|---|---|---|---|---|---|---|---|
| Date | |||||||
| 2016-01-01 | 1036.0 | 4.0 | 0.0 | 0.0 | 0.0 | 0 | 1.0 |
| 2016-01-01 | 1027.0 | 4.0 | 0.0 | 0.0 | 0.0 | 0 | 1.0 |
| 2016-01-01 | 1004.0 | 4.0 | 0.0 | 0.0 | 0.0 | 0 | 1.0 |
| 2016-01-01 | 1014.0 | 4.0 | 0.0 | 0.0 | 0.0 | 0 | 1.0 |
| 2016-01-02 | 1044.0 | 7.0 | 0.0 | 0.0 | 0.0 | 0 | 0.0 |
# Dimensions of the loaded frame: (rows, columns).
Store50.shape
(43326, 7)
#check missing values
# Per-column NaN counts; the same 1530 fully-empty rows show up in every column.
Store50.isna().sum()
Store 1530 Day Of Week 1530 Customers 1530 Open 1530 Promotion 1530 State Holiday 1530 School Holiday 1530 dtype: int64
#percentage of missing values
Store50.isna().mean() * 100
Store 3.531367 Day Of Week 3.531367 Customers 3.531367 Open 3.531367 Promotion 3.531367 State Holiday 3.531367 School Holiday 3.531367 dtype: float64
unique_store should retrieve the number of unique values of the Store column from the DataFrame Store50.Store column from the DataFrame Store50 and then apply the unique() method to it and store it in the defined variable uniq.return len(uniq) returns an integer of number of unique store values are there in the 'Store' column.## List the number of unique stores for which we have the data?
def unique_store(Store50):
    """Return the number of distinct values in the 'Store' column.

    NaN (from the fully-missing rows) counts as one unique value, matching
    pandas' Series.unique() behaviour in the original implementation.
    """
    # code starts here
    # nunique(dropna=False) == len(Series.unique()); both keep NaN once.
    distinct_ids = Store50['Store'].nunique(dropna=False)
    return distinct_ids
    # code ends here
# Number of distinct store ids (the NaN from empty rows counts as one value).
unique_store(Store50)
52
# no_obs_store returns the number of observations for each store as a dictionary.
def no_obs_store(Store50):
    """Return {store_id: row count} for every unique value in 'Store'.

    A NaN store id maps to 0: NaN never equals itself, so the original
    boolean-mask filter matched no rows for it — that behaviour is kept.
    """
    # code starts here
    store_counts = {}
    # Single O(n) value_counts pass instead of re-filtering the frame per store.
    counts = Store50['Store'].value_counts()  # drops NaN by default
    # Keep the key order produced by unique() (first-appearance order).
    for store in Store50['Store'].unique():
        # NaN is absent from `counts`, so .get() yields 0 for it.
        store_counts[store] = int(counts.get(store, 0))
    return store_counts
    # code ends here
# Observation counts per store id; the NaN bucket reports 0.
no_obs_store(Store50)
{1036.0: 852,
1027.0: 852,
1004.0: 852,
1014.0: 852,
1044.0: 852,
1041.0: 852,
1047.0: 852,
1012.0: 852,
1037.0: 668,
1005.0: 852,
1011.0: 852,
1043.0: 852,
1001.0: 668,
1048.0: 852,
1030.0: 852,
1035.0: 852,
1013.0: 852,
1029.0: 852,
1042.0: 852,
1017.0: 668,
1032.0: 852,
1024.0: 852,
1006.0: 668,
1007.0: 852,
1002.0: 668,
1019.0: 852,
1000.0: 668,
1009.0: 852,
1023.0: 668,
1016.0: 852,
1050.0: 852,
1031.0: 852,
1010.0: 852,
1034.0: 852,
1039.0: 852,
1033.0: 852,
1045.0: 852,
1015.0: 852,
1022.0: 852,
1040.0: 852,
1046.0: 852,
1021.0: 852,
1028.0: 852,
1008.0: 852,
1025.0: 668,
1020.0: 852,
1038.0: 668,
1003.0: 852,
1018.0: 852,
1026.0: 852,
1049.0: 852,
nan: 0}
# less_obs_stores returns the list of store ids having fewer than 852 observations.
def less_obs_stores(Store50):
    """Return the store ids whose row count is below 852.

    NaN stays in the result (its masked count is 0), matching the original
    per-store boolean-filter implementation.
    """
    # code starts here
    less_obs_stores_list = []
    # One value_counts() pass replaces a frame filter per store (O(n*k) -> O(n)).
    counts = Store50['Store'].value_counts()  # NaN dropped, so it reads as 0
    for store in Store50['Store'].unique():
        # count the observations for this store
        if int(counts.get(store, 0)) < 852:
            less_obs_stores_list.append(store)
    return less_obs_stores_list
    # code ends here
# Stores with an incomplete (short) history, plus the NaN bucket.
less_obs_stores(Store50)
[1037.0, 1001.0, 1017.0, 1006.0, 1002.0, 1000.0, 1023.0, 1025.0, 1038.0, nan]
avg_cust_store should return the average number of customers for two specific stores (Store 1044 and Store 1041) within the Store50 DataFrame.mean_1044 and mean_1041 to store the mean values of Store 1044 and Store 1041.# Here, we are also reversing the data as we need to contiguous data in the ascending order for a Time Series Analysis.
def avg_cust_store(Store50):
    """Return (mean customers of store 1044, mean customers of store 1041),
    each rounded to the nearest integer.

    The frame is first sorted by its Date index (ascending), since the
    time-series analysis downstream expects chronological order.
    """
    # code starts here
    mean_1041, mean_1044 = None, None
    # Chronological order for the time-series analysis.
    chronological = Store50.sort_index(ascending=True)
    store_col = chronological['Store']
    # Per-store customer series.
    customers_1041 = chronological.loc[store_col == 1041, 'Customers']
    customers_1044 = chronological.loc[store_col == 1044, 'Customers']
    # Round each mean to the nearest whole customer.
    mean_1041 = round(customers_1041.mean())
    mean_1044 = round(customers_1044.mean())
    return mean_1044, mean_1041
    # code ends here
# Returns (mean customers of store 1044, mean customers of store 1041).
avg_cust_store(Store50)
(474, 700)
time_series_plot should return a dataframe df where it stores the data of number of customers per each date availble in the dataset for Store id 1044 and Store id 1041.Store50 DataFrame .df with both the columns with names Store_1041 and Store_1044 and structure it with columns for each store's Customers data.| Store_1041 | Store_1044 | |
|---|---|---|
| Date | ||
| 2014-01-01 | 0.0 | 0.0 |
def time_series_plot(Store50):
    """Build a two-column frame of daily customer counts.

    Columns 'Store_1041' and 'Store_1044' hold the Customers series of the
    respective stores, aligned on the (sorted) Date index; dates present for
    only one store get NaN in the other column.
    """
    # code starts here
    df = None
    # Ascending Date order for plotting/resampling.
    ordered = Store50.sort_index(ascending=True)
    # Customers series per store id.
    cust_1041 = ordered.loc[ordered['Store'] == 1041, 'Customers']
    cust_1044 = ordered.loc[ordered['Store'] == 1044, 'Customers']
    # Index alignment happens inside the DataFrame constructor.
    df = pd.DataFrame({'Store_1041': cust_1041, 'Store_1044': cust_1044})
    return df
    # code ends here
# Daily customers per store, columns Store_1041 / Store_1044.
df = time_series_plot(Store50)
# Set the default figure size via matplotlib's public API instead of the
# discouraged `from pylab import rcParams` bulk import (plt.rcParams is the
# same configuration dictionary).
plt.rcParams['figure.figsize'] = 15, 8
## Resample the data into a weekly time series to understand how the number of customers changes weekly.
# Weekly totals per store; .plot(figsize=...) overrides the default per figure.
df.resample('W').sum().plot(figsize=(8, 4))
plt.title("Weekly Change of Customers for Store 1041 and Store 1044")
plt.xlabel("Date")
plt.ylabel("Number of Customers")
plt.legend()
plt.show()
## Resample the data into a monthly time series to understand how the number of the customers changes monthly.
# Monthly totals per store. NOTE(review): newer pandas prefers 'ME' over 'M'
# for month-end resampling; 'M' still works here.
df.resample('M').sum().plot(figsize=(8, 4))
plt.title("Monthly Change of Customers for Store 1041 and Store 1044")
plt.xlabel("Date")
plt.ylabel("Number of Customers")
plt.legend()
plt.show()
# store_id_max_weekly: which store has the larger weekly customers aggregation
# (the companion store_id_max_monthly below does the same monthly).
def store_id_max_weekly(df):
    """Return 'Store_1041' or 'Store_1044', whichever has the higher mean of
    weekly-summed customers (ties go to Store_1044, as before)."""
    # code starts here
    max_id = None
    # Weekly totals per store.
    weekly_totals = df.resample('W').sum()
    # Strict > keeps the original tie-breaking toward Store_1044.
    higher_1041 = weekly_totals['Store_1041'].mean() > weekly_totals['Store_1044'].mean()
    max_id = "Store_1041" if higher_1041 else "Store_1044"
    return max_id
    # code ends here
# Weekly winner between the two stores.
store_id_max_weekly(df)
'Store_1041'
def store_id_max_monthly(df):
    """Return 'Store_1041' or 'Store_1044', whichever has the higher mean of
    monthly-summed customers (ties go to Store_1044, as before)."""
    # code starts here
    max_id = None
    # Monthly totals per store.
    monthly_totals = df.resample('M').sum()
    # Strict > keeps the original tie-breaking toward Store_1044.
    higher_1041 = monthly_totals['Store_1041'].mean() > monthly_totals['Store_1044'].mean()
    max_id = "Store_1041" if higher_1041 else "Store_1044"
    return max_id
    # code ends here
# Monthly winner between the two stores.
store_id_max_monthly(df)
'Store_1041'
weekly_1041 will return the weekly sum for store id 1041.resample() to calculate the weekly sum for the dataframe df and drop the Store_1044 to get the dataframe for weekly 1041.Use axis=1, inplace=Truedf_weekly_1041# weekly sum for store_1041
def weekly_1041(df):
    """Weekly customer totals for store 1041 only.

    Resamples df to week-end sums and removes the Store_1044 column.
    """
    df_weekly_1041 = None
    #code starts here
    weekly_totals = df.resample('W').sum()
    df_weekly_1041 = weekly_totals.drop(columns='Store_1044')
    # code ends here
    return df_weekly_1041
# Weekly aggregation for store 1041.
df_weekly_1041 = weekly_1041(df)
# weekly_1044 returns the weekly sum for store id 1044: resample() computes the
# weekly sum of df, then Store_1041 is dropped to get the weekly 1044 frame.
def weekly_1044(df):
    """Weekly customer totals for store 1044 only."""
    df_weekly_1044 = None
    #code starts here
    weekly_totals = df.resample('W').sum()
    df_weekly_1044 = weekly_totals.drop(columns='Store_1041')
    # code ends here
    return df_weekly_1044
# Weekly aggregation for store 1044.
df_weekly_1044 = weekly_1044(df)
# Save the weekly aggregates into CSV files.
df_weekly_1041.to_csv('weekly_1041.csv')
df_weekly_1044.to_csv('weekly_1044.csv')
Do they both exhibit stationarity? State your results of the Augmented Dickey-Fuller (ADF) test
Print your output in the given format :
Data is stationary
Data is not stationary
adf_test should be designed for analyzing time series data using the Augmented Dickey-Fuller (ADF) test, a statistical test used to determine if a time series is stationary or not.0.05 then return data is stationary or else return the data is non-stationaryChecking for stationarity of data using ADF test function:
## Defining a function
def adf_test(timeseries):
    """Plot rolling statistics and run the Augmented Dickey-Fuller test.

    Parameters:
        timeseries: pandas Series/DataFrame with a datetime index.

    Returns:
        'Data is stationary' when the ADF p-value <= 0.05,
        otherwise 'Data is not stationary'.
    """
    # code starts here
    # compute the rolling mean and standard deviation with a window of 7
    rolling_mean = timeseries.rolling(window=7).mean()
    rolling_std = timeseries.rolling(window=7).std()
    # Compare the original series against its rolling mean and std.
    plt.figure(figsize=(7, 4))
    plt.plot(timeseries, color='blue', label='Original Data')
    plt.plot(rolling_mean, color='red', label='Rolling Mean')
    plt.plot(rolling_std, color='green', label='Rolling Std')
    plt.title('Original Data with Rolling Statistics')
    plt.xlabel("Date")
    plt.xticks(rotation = 45)
    plt.legend()
    plt.show()
    # Perform the Augmented Dickey-Fuller (ADF) test (constant + trend regression).
    adf_result = adfuller(timeseries, regression="ct")
    # Extract and print the ADF test results.
    adf_statistic = adf_result[0]
    p_value = adf_result[1]
    print(f'ADF Statistic: {adf_statistic}')
    print(f'p-value: {p_value}')
    print("Critical values")
    # (Removed the unused temp_key/temp_value locals from the original loop.)
    for key, value in adf_result[4].items():
        print(f"\t{key}:{value}")
    # Reject the unit-root null hypothesis when p <= 0.05.
    if p_value <= 0.05:
        return 'Data is stationary'
    else:
        return 'Data is not stationary'
    # code ends here
# Check stationarity of the store 1041 weekly aggregated data.
adf_test(df_weekly_1041)
ADF Statistic: -3.451807717084272 p-value: 0.04484114760577085 Critical values 1%:-4.034959588225446 5%:-3.44710896788718 10%:-3.1485194734361026
'Data is stationary'
The Augmented Dickey-Fuller (ADF) test is a statistical test used to check if a time series is stationary, or has a constant mean and variance over time. The null hypothesis of the ADF test is that the time series is non-stationary, meaning it has a unit root. If the p-value of the test is less than a specified significance level (typically 0.05), we reject the null hypothesis and conclude that the time series is stationary.
The weekly aggregated store data for store 1041 is stationary. This is because the p-value is less than the significance level of 0.05, indicating that we reject the null hypothesis that the time series has a unit root, and therefore, the time series is stationary.
# Check stationarity of the store 1044 weekly aggregated data.
adf_test(df_weekly_1044)
ADF Statistic: -3.4428466408668172 p-value: 0.04592102999977229 Critical values 1%:-4.034959588225446 5%:-3.44710896788718 10%:-3.1485194734361026
'Data is stationary'
# train_test_split_1041 splits the weekly series df_weekly_1041 into training
# and testing sets (train_test_split_1044 below does the same for 1044).
def train_test_split_1041(df_weekly_1041):  # Split: year 2016 is the test data
    """Return (train, test): rows outside 2016 train, rows inside 2016 test."""
    train_1041, test_1041 = None, None
    #code starts here
    in_2016 = df_weekly_1041.index.year == 2016
    train_1041 = df_weekly_1041[~in_2016]
    test_1041 = df_weekly_1041[in_2016]
    # code ends here
    return train_1041, test_1041
# Single call with tuple unpacking; the original invoked the splitter once
# per element, performing the year filtering twice for no benefit.
train_1041, test_1041 = train_test_split_1041(df_weekly_1041)
def train_test_split_1044(df_weekly_1044):  # Split: year 2016 is the test data
    """Return (train, test): rows outside 2016 train, rows inside 2016 test."""
    train_1044, test_1044 = None, None
    # code starts here
    in_2016 = df_weekly_1044.index.year == 2016
    train_1044 = df_weekly_1044[~in_2016]
    test_1044 = df_weekly_1044[in_2016]
    # code ends here
    return train_1044, test_1044
# Single call with tuple unpacking; the original invoked the splitter once
# per element, performing the year filtering twice for no benefit.
train_1044, test_1044 = train_test_split_1044(df_weekly_1044)
## Auto correlation plot for weekly aggregation data of Store 1041
# Create the axes explicitly and pass them to plot_acf; otherwise plot_acf
# opens its own figure and the pre-sized one stays empty (that is the stray
# "<Figure size 1200x800 with 0 Axes>" output that follows).
fig, ax = plt.subplots(figsize=(12, 8))
plot_acf(train_1041, lags=20, ax=ax)
plt.title("ACF Plot for Store 1041")
plt.show()
<Figure size 1200x800 with 0 Axes>
## Auto correlation plot for weekly aggregation data of Store 1044
# Pass the axes to plot_acf so the sized figure is actually used (fixes the
# empty "<Figure size 1200x800 with 0 Axes>" artifact).
fig, ax = plt.subplots(figsize=(12, 8))
plot_acf(train_1044, lags=20, ax=ax)
plt.title("ACF Plot for Store 1044")
plt.show()
<Figure size 1200x800 with 0 Axes>
## Partial Auto correlation plot for weekly aggregation data of Store 1041
# Pass the axes to plot_pacf so the sized figure is actually used (fixes the
# empty "<Figure size 1200x800 with 0 Axes>" artifact).
fig, ax = plt.subplots(figsize=(12, 8))
plot_pacf(train_1041, lags=20, ax=ax)
plt.title("PACF Plot for Store 1041")
plt.show()
<Figure size 1200x800 with 0 Axes>
## Partial Auto correlation plot for weekly aggregation data of Store 1044
# Pass the axes to plot_pacf so the sized figure is actually used (fixes the
# empty "<Figure size 1200x800 with 0 Axes>" artifact).
fig, ax = plt.subplots(figsize=(12, 8))
plot_pacf(train_1044, lags=20, ax=ax)
plt.title("PACF Plot for Store 1044")
plt.show()
<Figure size 1200x800 with 0 Axes>
itertools.product#set up grid search option
# Grid-search candidates: AR order 1-4, no differencing (series is
# stationary per the ADF test above), MA order 1-5.
p = range(1, 5)
d = range(0, 1) #since the data is stationary, no need for differencing
q = range(1, 6)
#seasonality parameters
s = [7] #weekly data
P = Q = range(0, 3)
D = range(0, 1) #since the data is stationary, no need for differencing
pdq = list(itertools.product(p, d, q))
# NOTE(review): these module-level grids appear unused later — both search
# functions below rebuild their own grids internally.
seasonal_pdq = list(itertools.product(P, D, Q, s))
param and AIC¶warnings.filterwarnings("ignore")
def find_best_arima_params(train_data):
    """
    Find the best p, d, q values for the ARIMA model based on the least AIC values.
    Parameters:
    train_data (DataFrame): Training data for the time series.
    Returns:
    tuple: The best p, d, q values for the ARIMA model
    """
    # Define the parameter grids
    p = range(1, 5)
    d = range(0, 1)  # since the data is stationary, no need for differencing
    q = range(1, 6)
    pdq = list(itertools.product(p, d, q))
    # Collect result rows in a list: DataFrame.append() was removed in
    # pandas 2.0, and building the frame once at the end is faster anyway.
    rows = []
    # Fit ARIMA models and record AIC values
    for param in pdq:
        try:
            model = ARIMA(train_data, order=param)
            results = model.fit()
            rows.append({'param': param, 'AIC': results.aic})
            print(f'ARIMA{param} - AIC: {results.aic}')
        except Exception as e:
            print(e)
            continue
    ARIMA_AIC = pd.DataFrame(rows, columns=['param', 'AIC'])
    # Find the best parameter combination (lowest AIC)
    best_params = ARIMA_AIC.loc[ARIMA_AIC['AIC'].idxmin()]
    return (best_params['param'])
# Example usage:
# train_1041, test_1041 = train_test_split_1041(df_weekly_1041)
# best_params = find_best_arima_params(train_1041, seasonal_period=7) # Assuming weekly data
# print(f"Best ARIMA Model: ARIMA{best_params[0]}x{best_params[1]}{best_params[2]}")
# Grid-search non-seasonal (p, d, q) orders for the ARIMA model on store 1041.
best_params = find_best_arima_params(train_1041)
ARIMA(1, 0, 1) - AIC: 1696.4863942067586 ARIMA(1, 0, 2) - AIC: 1698.1846183240498 ARIMA(1, 0, 3) - AIC: 1699.697015327729 ARIMA(1, 0, 4) - AIC: 1701.442307522423 ARIMA(1, 0, 5) - AIC: 1703.0575565630447 ARIMA(2, 0, 1) - AIC: 1698.1866400983952 ARIMA(2, 0, 2) - AIC: 1696.353241969533 ARIMA(2, 0, 3) - AIC: 1695.1009703597329 ARIMA(2, 0, 4) - AIC: 1698.9568040459285 ARIMA(2, 0, 5) - AIC: 1698.8631807529018 ARIMA(3, 0, 1) - AIC: 1699.6795606875137 ARIMA(3, 0, 2) - AIC: 1697.4691304444832 ARIMA(3, 0, 3) - AIC: 1700.109843589867 ARIMA(3, 0, 4) - AIC: 1699.4197954613526 ARIMA(3, 0, 5) - AIC: 1701.4468628181712 ARIMA(4, 0, 1) - AIC: 1701.473371461097 ARIMA(4, 0, 2) - AIC: 1696.8358057795535 ARIMA(4, 0, 3) - AIC: 1699.0902900870474 ARIMA(4, 0, 4) - AIC: 1701.1620101324415 ARIMA(4, 0, 5) - AIC: 1700.160636036937
best_params
# Report the chosen non-seasonal order found above.
print(f"Best ARIMA Model: ARIMA - {best_params[0]}x{best_params[1]}x{best_params[2]}")
Best ARIMA Model: ARIMA - 2x0x3
warnings.filterwarnings("ignore")
def find_best_arima_params(train_data, seasonal_period):
    """
    Find the best p, d, q, P, D, Q values for the ARIMA model with seasonality based on the least AIC values.
    Parameters:
    train_data (DataFrame): Training data for the time series.
    seasonal_period (int): The number of periods in a season (e.g., 12 for monthly data).
    Returns:
    tuple: The best p, d, q, P, D, Q values for the ARIMA model with seasonality.
    """
    # NOTE(review): this redefines the non-seasonal find_best_arima_params
    # above with a different signature; only this version exists from here on.
    # Define the parameter grids
    p = range(0, 4)
    d = range(0, 1)  # since the data is stationary, no need for differencing
    q = range(0, 5)
    P = range(0, 4)
    D = range(0, 1)  # since the data is stationary, no need for differencing
    Q = range(0, 5)
    pdq = list(itertools.product(p, d, q))
    seasonal_pdq = list(itertools.product(P, D, Q))
    # Collect result rows in a list: DataFrame.append() was removed in pandas 2.0.
    rows = []
    # Fit seasonal ARIMA models and record AIC values
    for param in pdq:
        for seasonal_param in seasonal_pdq:
            try:
                model = SARIMAX(train_data, order=param, seasonal_order=(*seasonal_param, seasonal_period))
                results = model.fit(disp=False)
                rows.append({'param': param, 'seasonal_param': seasonal_param, 'AIC': results.aic})
                print(f'ARIMA{param}x{seasonal_param}{seasonal_period} - AIC: {results.aic}')
            except Exception as e:
                # Surface the failure (per the guidelines: print inside except)
                # instead of silently skipping the combination.
                print(e)
                continue
    ARIMA_AIC = pd.DataFrame(rows, columns=['param', 'seasonal_param', 'AIC'])
    # Display the results
    print(ARIMA_AIC)
    # Find the best parameter combination (lowest AIC)
    best_params = ARIMA_AIC.loc[ARIMA_AIC['AIC'].idxmin()]
    return (best_params['param'], best_params['seasonal_param'], seasonal_period)
# Find best params for the seasonal ARIMA (SARIMA) model.
best_params_seasonal = find_best_arima_params(train_1041, seasonal_period=7) # Assuming weekly data
best_params
(2, 0, 3)
# Fit the ARIMA model with the selected (2, 0, 3) order.
arima_model = ARIMA(train_1041, order = (2, 0, 3))
arima_fit = arima_model.fit()
/home/labuser/.local/lib/python3.8/site-packages/statsmodels/base/model.py:607: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
warnings.warn("Maximum Likelihood optimization failed to "
# Make predictions with ARIMA model over the full test horizon.
pred_1041 = arima_fit.forecast(steps=len(test_1041))
pred_1041.values
array([4939.99952863, 4844.44435978, 4890.61845183, 4917.64842079,
4875.4840782 , 4856.06979371, 4894.09393576, 4907.33507809,
4873.43226015, 4865.12542496, 4895.03570539, 4899.46189956,
4873.33767339, 4871.90475379, 4894.50111707, 4893.68041712,
4874.3226779 , 4876.78719126, 4893.210448 , 4889.59890659,
4875.8034432 , 4880.16265376, 4891.63035217])
# Fit SARIMA model with non-seasonal order (2, 0, 3) and seasonal order (2, 0, 3, 7).
sarima_model = SARIMAX(train_1041, order=(2, 0, 3), seasonal_order=(2, 0, 3, 7))
sarima_fit = sarima_model.fit(disp = False)
# Make predictions with SARIMA model over the full test horizon.
pred_sarima_1041 = sarima_fit.forecast(steps=len(test_1041))
/home/labuser/.local/lib/python3.8/site-packages/statsmodels/tsa/statespace/sarimax.py:966: UserWarning: Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.
warn('Non-stationary starting autoregressive parameters'
/home/labuser/.local/lib/python3.8/site-packages/statsmodels/tsa/statespace/sarimax.py:978: UserWarning: Non-invertible starting MA parameters found. Using zeros as starting parameters.
warn('Non-invertible starting MA parameters found.'
/home/labuser/.local/lib/python3.8/site-packages/statsmodels/base/model.py:607: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
warnings.warn("Maximum Likelihood optimization failed to "
pred_sarima_1041
2016-01-03 4829.138346 2016-01-10 5035.553351 2016-01-17 4853.526205 2016-01-24 5132.517825 2016-01-31 4999.550003 2016-02-07 4845.135448 2016-02-14 5125.420443 2016-02-21 5035.588350 2016-02-28 4952.903837 2016-03-06 5077.261292 2016-03-13 4940.618739 2016-03-20 5054.516327 2016-03-27 4937.835967 2016-04-03 5052.467668 2016-04-10 4960.451796 2016-04-17 5029.228773 2016-04-24 4967.516190 2016-05-01 5028.122745 2016-05-08 4949.632081 2016-05-15 5042.530505 2016-05-22 4981.433882 2016-05-29 5017.472990 2016-06-05 4960.202921 Freq: W-SUN, Name: predicted_mean, dtype: float64
# Plot the held-out 2016 weekly series for store 1041.
plt.figure(figsize=(10, 4))
plt.plot(test_1041, label = "1041 Test")
plt.title("Weekly Change of Customers for Test Store 1041")
plt.xlabel("Date")
plt.ylabel("Number of Customers")
plt.legend()
plt.show()
# Overlay ARIMA vs SARIMA forecasts for the same horizon.
plt.figure(figsize=(10, 4))
# Plotting the predictions from ARIMA model
plt.plot(pred_1041, label='ARIMA Predictions', color='green')
# Plotting the predictions from SARIMA model
plt.plot(pred_sarima_1041, label='SARIMA Predictions', color='red')
plt.title("Weekly Change of Customers based on ARIMA/SARIMA Predictions")
plt.xlabel("Date")
plt.ylabel("Number of Customers")
plt.legend()
plt.show()
def timeseries_evaluation_metrics_func(y_true, y_pred):
    # code starts here
    """
    Calculate evaluation metrics for time series prediction.
    Parameters:
    y_true : array-like
        Array containing the true values.
    y_pred : array-like
        Array containing the predicted values.
    Returns:
    dict with keys 'mse', 'rmse', 'mae', 'mape'. (The original printed the
    metrics and returned None; callers that ignore the result are unaffected.)
    """
    # Calculate evaluation metrics
    mse = np.mean((y_pred - y_true) ** 2)
    rmse = np.sqrt(mse)
    mae = np.mean(np.abs(y_pred - y_true))
    # NOTE: MAPE divides by y_true, so zero actuals produce inf/nan.
    mape = np.mean(np.abs((y_pred - y_true) / y_true)) * 100
    # Print evaluation metrics
    print("Mean Squared Error:", mse)
    print("Root Mean Squared Error:", rmse)
    print("Mean Absolute Error:", mae)
    print("Mean Absolute Percentage Error:", mape)
    return {'mse': mse, 'rmse': rmse, 'mae': mae, 'mape': mape}
    # code ends here
Evaluation metrics for ARIMA model
# Evaluate ARIMA predictions against the 2016 actuals.
timeseries_evaluation_metrics_func(test_1041['Store_1041'],pred_1041)
Mean Squared Error: 3511434.0311806826 Root Mean Squared Error: 1873.8820750465284 Mean Absolute Error: 1290.3832600139126 Mean Absolute Percentage Error: 89.16062677593794
Evaluation metrics for SARIMA model
# Evaluate SARIMA predictions against the 2016 actuals.
timeseries_evaluation_metrics_func(test_1041['Store_1041'], pred_sarima_1041)
Mean Squared Error: 3723735.922870717 Root Mean Squared Error: 1929.6984020490656 Mean Absolute Error: 1332.1612103118368 Mean Absolute Percentage Error: 91.69256485284834
from omnixai.data.timeseries import Timeseries
from omnixai.explainers.timeseries import ShapTimeseries
from omnixai.explainers.timeseries import TimeseriesExplainer
IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
# threshold for detecting anomaly data points: 90th percentile of the
# training-period weekly customer counts for store 1041
threshold = np.percentile(train_1041["Store_1041"].values, 90)
print("Threshold: ", threshold)
Threshold: 5821.6
# detector for determining whether a window of time series is anomalous
def detector(ts: Timeseries):
    """Fraction of points in the window that exceed the global `threshold`."""
    above_threshold = (ts.values > threshold).astype(int)
    return np.sum(above_threshold) / ts.shape[0]
#SHAP generates local explanations
# Wrap the anomaly detector above in a SHAP time-series explainer.
explainer = ShapTimeseries(
    training_data=Timeseries.from_pd(train_1041),
    predict_function=detector,
    mode="anomaly_detection"
)
# Generate explanations for the held-out 2016 window.
test_x = Timeseries.from_pd(test_1041)
explanations = explainer.explain(test_x)
print("SHAP results:")
explanations.ipython_plot()
100%|█████████████████████████████████████████████| 1/1 [00:04<00:00, 4.87s/it]
SHAP results:
#Save the models into local directory
import os
# Ensure the target directory exists first; open() alone raises
# FileNotFoundError when "models/" is missing.
os.makedirs("models", exist_ok=True)
# Save ARIMA model
with open("models/ARIMA_Model_Store1041.t5", "wb") as f:
    pickle.dump(arima_fit, f)
# Save SARIMA model
with open("models/SARIMA_Model_Store1041.t5", "wb") as f:
    pickle.dump(sarima_fit, f)
import unittest
import streamlit as st
# Function to load ARIMA model
def load_arima_model(file_path):
    """Unpickle a saved ARIMA results object.

    Prints the error and returns None when the file is missing or unreadable.
    """
    try:
        with open(file_path, "rb") as f:
            return pickle.load(f)
    except Exception as e:
        print(e)
        return None
# Function to load SARIMA model
def load_sarima_model(file_path):
    """Unpickle a saved SARIMA results object.

    Prints the error and returns None when the file is missing or unreadable.
    """
    try:
        with open(file_path, "rb") as f:
            return pickle.load(f)
    except Exception as e:
        print(e)
        return None
#load the ARIMA model
arima_model = load_arima_model("models/ARIMA_Model_Store1041.t5")
#load the SARIMA model
# Consistency fix: use the SARIMA loader here (the original called the ARIMA
# loader; the two behave identically, but the intent is now explicit).
sarima_model = load_sarima_model("models/SARIMA_Model_Store1041.t5")
#arima model loading test case
def test_arima_model_loading(model_path):
    # Load ARIMA model via the helper above (returns None on any failure).
    arima_model = load_arima_model(model_path)
    # Test model loading
    assert arima_model is not None, "ARIMA model loading failed."
#sarima model loading test case
def test_sarima_model_loading(model_path):
    # Load SARIMA model via the helper above (returns None on any failure).
    sarima_model = load_sarima_model(model_path)
    # Test model loading
    assert sarima_model is not None, "SARIMA model loading failed."
# Run both loading checks against the files saved earlier.
test_arima_model_loading("models/ARIMA_Model_Store1041.t5")
test_sarima_model_loading("models/SARIMA_Model_Store1041.t5")
#create unit test on model predictions
def test_model_predictions(model_object):
    # Generate sample test data; only its length (forecast horizon) is used.
    test_data = pd.DataFrame(np.random.randn(10, 1), columns=['Value'])
    # Test that the prediction length matches the requested horizon.
    prediction = model_object.forecast(steps=len(test_data))
    assert len(prediction) == len(test_data), "Incorrect number of predictions."
    # Add more assertions as needed
#arima model test predictions
test_model_predictions(arima_model)
#sarima model test predictions
test_model_predictions(sarima_model)
#check if there any non zero predictions
def test_arima_non_zero_prediction(model_file):
# Generate sample test data
test_data = pd.DataFrame(np.random.randn(10, 1), columns=['Value'])
# Test prediction range
prediction = model_file.forecast(steps=len(test_data))
assert not np.any(prediction.values <= 0), "ARIMA prediction values cann't be less than zero."
def test_sarima_non_zero_prediction(model_file):
# Generate sample test data
test_data = pd.DataFrame(np.random.randn(10, 1), columns=['Value'])
# Test prediction range
prediction = model_file.forecast(steps=len(test_data))
assert not np.any(prediction <= 0), "SARIMA prediction values cann't be less than zero."
# Check that both fitted models forecast strictly positive customer counts.
test_arima_non_zero_prediction(arima_model)
test_sarima_non_zero_prediction(sarima_model)
#model deployment is done using streamlit
Final Submission guidelines:
-------------------------------------------------- ASSESSMENT ENDS HERE ---------------------------------------------------------